(see “TCR_concordance.html” from 9/10/2021 stored in TCR_Concordance Dropbox)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.1 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
tcr1<- read_tsv("/Users/jen/Dropbox/TCR_Concordance/concord_data/E4412_1A_TIL_CONTROL_DNA.tsv", guess_max = 1000000)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## sku = col_logical(),
## test_name = col_logical(),
## sample_catalog_tags = col_logical(),
## sample_rich_tags = col_logical(),
## sample_rich_tags_json = col_logical(),
## hla_class_i = col_logical(),
## hla_class_ii = col_logical(),
## kit_control = col_logical(),
## total_templates = col_double(),
## productive_templates = col_double(),
## outofframe_templates = col_double(),
## stop_templates = col_double(),
## dj_templates = col_double(),
## total_rearrangements = col_double(),
## productive_rearrangements = col_double(),
## outofframe_rearrangements = col_double(),
## stop_rearrangements = col_double(),
## dj_rearrangements = col_double(),
## total_reads = col_logical(),
## total_productive_reads = col_logical()
## # ... with 51 more columns
## )
## ℹ Use `spec()` for the full column specifications.
tcr2<- read_tsv("/Users/jen/Dropbox/TCR_Concordance/concord_data/E4412_1B_TIL_CONTROL.tsv", guess_max = 1000000)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## sku = col_logical(),
## test_name = col_logical(),
## sample_catalog_tags = col_logical(),
## sample_rich_tags = col_logical(),
## sample_rich_tags_json = col_logical(),
## hla_class_i = col_logical(),
## hla_class_ii = col_logical(),
## kit_control = col_logical(),
## total_templates = col_double(),
## productive_templates = col_double(),
## outofframe_templates = col_double(),
## stop_templates = col_double(),
## dj_templates = col_double(),
## total_rearrangements = col_double(),
## productive_rearrangements = col_double(),
## outofframe_rearrangements = col_double(),
## stop_rearrangements = col_double(),
## dj_rearrangements = col_double(),
## total_reads = col_logical(),
## total_productive_reads = col_logical()
## # ... with 50 more columns
## )
## ℹ Use `spec()` for the full column specifications.
dim(tcr1)
## [1] 1652 118
dim(tcr2)
## [1] 1936 118
colnames(tcr1)
## [1] "sample_name"
## [2] "species"
## [3] "locus"
## [4] "product_subtype"
## [5] "kit_pool"
## [6] "sku"
## [7] "test_name"
## [8] "sample_catalog_tags"
## [9] "sample_rich_tags"
## [10] "sample_rich_tags_json"
## [11] "hla_class_i"
## [12] "hla_class_ii"
## [13] "kit_control"
## [14] "total_templates"
## [15] "productive_templates"
## [16] "outofframe_templates"
## [17] "stop_templates"
## [18] "dj_templates"
## [19] "total_rearrangements"
## [20] "productive_rearrangements"
## [21] "outofframe_rearrangements"
## [22] "stop_rearrangements"
## [23] "dj_rearrangements"
## [24] "total_reads"
## [25] "total_productive_reads"
## [26] "total_outofframe_reads"
## [27] "total_stop_reads"
## [28] "total_dj_reads"
## [29] "productive_simpson_clonality"
## [30] "productive_clonality"
## [31] "productive_entropy"
## [32] "sample_simpson_clonality"
## [33] "sample_clonality"
## [34] "sample_entropy"
## [35] "sample_amount_ng"
## [36] "sample_cells_mass_estimate"
## [37] "fraction_productive_of_cells_mass_estimate"
## [38] "sample_cells"
## [39] "fraction_productive_of_cells"
## [40] "max_productive_frequency"
## [41] "max_frequency"
## [42] "counting_method"
## [43] "primer_set"
## [44] "sequence_result_status"
## [45] "release_date"
## [46] "upload_date"
## [47] "sample_tags"
## [48] "fraction_productive"
## [49] "order_name"
## [50] "kit_id"
## [51] "total_t_cells"
## [52] "total_templates_agg"
## [53] "rearrangement"
## [54] "amino_acid"
## [55] "frame_type"
## [56] "rearrangement_type"
## [57] "templates"
## [58] "seq_reads"
## [59] "frequency"
## [60] "productive_frequency"
## [61] "cdr3_length"
## [62] "v_family"
## [63] "v_gene"
## [64] "v_allele"
## [65] "d_family"
## [66] "d_gene"
## [67] "d_allele"
## [68] "j_family"
## [69] "j_gene"
## [70] "j_allele"
## [71] "v_deletions"
## [72] "d5_deletions"
## [73] "d3_deletions"
## [74] "j_deletions"
## [75] "n2_insertions"
## [76] "n1_insertions"
## [77] "v_index"
## [78] "n1_index"
## [79] "n2_index"
## [80] "d_index"
## [81] "j_index"
## [82] "v_family_ties"
## [83] "v_gene_ties"
## [84] "v_allele_ties"
## [85] "d_family_ties"
## [86] "d_gene_ties"
## [87] "d_allele_ties"
## [88] "j_family_ties"
## [89] "j_gene_ties"
## [90] "j_allele_ties"
## [91] "sequence_tags"
## [92] "v_shm_count"
## [93] "v_shm_indexes"
## [94] "antibody"
## [95] "bio_identity"
## [96] "rearrangement_trunc"
## [97] "v_resolved"
## [98] "d_resolved"
## [99] "j_resolved"
## [100] "extended_rearrangement"
## [101] "cdr1_rearrangement"
## [102] "cdr1_amino_acid"
## [103] "cdr1_start_index"
## [104] "cdr1_rearrangement_length"
## [105] "cdr2_rearrangement"
## [106] "cdr2_amino_acid"
## [107] "cdr2_start_index"
## [108] "cdr2_rearrangement_length"
## [109] "cdr3_rearrangement"
## [110] "cdr3_amino_acid"
## [111] "cdr3_start_index"
## [112] "cdr3_rearrangement_length"
## [113] "chosen_v_family"
## [114] "chosen_v_gene"
## [115] "chosen_v_allele"
## [116] "chosen_j_family"
## [117] "chosen_j_gene"
## [118] "chosen_j_allele"
tcr1<- tcr1 %>%
select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
filter(!is.na(productive_frequency)) %>%
mutate(sample = "1A")
tcr2<- tcr2 %>%
select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
filter(!is.na(productive_frequency)) %>%
mutate(sample = "1B")
dim(tcr1)
## [1] 1282 6
dim(tcr2)
## [1] 1361 6
tcr1_2<- inner_join(tcr1, tcr2, by=c("v_gene" = "v_gene", "j_gene" ="j_gene", "cdr3_amino_acid"= "cdr3_amino_acid"))
ggplot(tcr1_2, aes(x=productive_frequency.x, y= productive_frequency.y)) +
geom_point()+
scale_x_log10() +
scale_y_log10() +
geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
theme_classic(base_size = 14) +
ggtitle("correlation of productive frequency of all overlapping clones")
cor(tcr1_2$productive_frequency.x, tcr1_2$productive_frequency.y)
## [1] 0.9991812
## check the productive frequecy distribution. most of them are below 0.1%
ggplot(tcr1, aes(x= productive_frequency))+
geom_histogram(bins=100) +
scale_x_log10() +
geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
theme_classic(base_size = 14)
## 99% of the clones frequency < 0.01
mean(tcr1$productive_frequency <0.01)
## [1] 0.9914197
## 82 clones have frequency > 0.001, so, checking top 50 or top 100 clones makes sense.
sum(tcr1$productive_frequency > 0.001)
## [1] 82
## 76 clones
sum(tcr2$productive_frequency > 0.001)
## [1] 76
### Check the correlation for the top 50 clones
## there are ties, so 53 clones were selected
tcr1_top50<- tcr1 %>%
slice_max(order_by = productive_frequency, n = 50, with_ties = TRUE)
tcr1_2_top50<- inner_join(tcr1_top50, tcr2, by=c("v_gene" = "v_gene", "j_gene" ="j_gene", "cdr3_amino_acid"= "cdr3_amino_acid"))
## 53 clones appear in both 1A and 1B samples!
dim(tcr1_2_top50)
## [1] 53 9
ggplot(tcr1_2_top50, aes(x=productive_frequency.x, y= productive_frequency.y)) +
geom_point()+
scale_x_log10() +
scale_y_log10() +
geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
theme_classic(base_size = 14) +
ggtitle("correlation of productive frequency of top 50 clones")
cor(tcr1_2_top50$productive_frequency.x, tcr1_2_top50$productive_frequency.y)
## [1] 0.9991644
library(tidyverse)
library(dplyr)
library(ggplot2)
#set input directory
indir<-"/Users/jen/Dropbox/TCR_Concordance/concord_data"
outdir<-"/Users/jen/Dropbox/TCR_Concordance/concord_output/2021_09_14/"
#create read table function that can accept a variable as a table name
read_tcr<-function(k){
tcr_table<-read.table(k,header=T,sep="\t")
tcr_table<-tcr_table %>%
select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
filter(!is.na(productive_frequency))
return(tcr_table)
}
# get list of filenames from input directory
ff <- list.files(path=indir, full.names=TRUE)
# apply read_tcr function to list of files, add file names to each list member
myfilelist <- lapply(ff, read_tcr)
names(myfilelist)<-list.files(path=indir, full.names=FALSE)
#create separate file list with top 50
read_tcr_top<-function(s){
tcr_table_50<-read.table(s,header=T,sep="\t")
tcr_table_50<-tcr_table_50 %>%
select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
filter(!is.na(productive_frequency)) %>%
slice_max(order_by = productive_frequency, n = 50, with_ties = TRUE)
return(tcr_table_50)
}
#create separate file list with top 100
read_tcr_top100<-function(s){
tcr_table_100<-read.table(s,header=T,sep="\t")
tcr_table_100<-tcr_table_100 %>%
select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
filter(!is.na(productive_frequency)) %>%
slice_max(order_by = productive_frequency, n = 100, with_ties = TRUE)
return(tcr_table_100)
}
#create separate file list with top 500
read_tcr_top500<-function(s){
tcr_table_500<-read.table(s,header=T,sep="\t")
tcr_table_500<-tcr_table_500 %>%
select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
filter(!is.na(productive_frequency)) %>%
slice_max(order_by = productive_frequency, n = 500, with_ties = TRUE)
return(tcr_table_500)
}
myfilelist_top50<- lapply(ff,read_tcr_top)
names(myfilelist_top50)<-list.files(path=indir, full.names=FALSE)
myfilelist_top100<-lapply(ff,read_tcr_top100)
names(myfilelist_top100)<-list.files(path=indir, full.names=FALSE)
myfilelist_top500<-lapply(ff,read_tcr_top500)
names(myfilelist_top500)<-list.files(path=indir, full.names=FALSE)
########################################
##### Productive Frequency Ranges ######
########################################
NameList<-list.files(path=indir, full.names=FALSE)
# get ranges of productive frequencies
prod_freq_matrix<-matrix(nrow=length(myfilelist),ncol=12)
rownames(prod_freq_matrix)<-NameList
colnames(prod_freq_matrix)<-c("all_clones_min","all_clone_max","all_clones_mean","top500_min","top500_max","top500_mean","top100_min","top100_max","top100_mean", "top50_min","top50_max","top50_mean")
## range for all clones
for (i in 1:length(myfilelist)){
prod_freq_matrix[i,1:3]<-c(range(myfilelist[[i]]$productive_frequency),mean(myfilelist[[i]]$productive_frequency))
prod_freq_matrix[i,4:6]<-c(range(myfilelist_top500[[i]]$productive_frequency),mean(myfilelist_top500[[i]]$productive_frequency))
prod_freq_matrix[i,7:9]<-c(range(myfilelist_top100[[i]]$productive_frequency),mean(myfilelist_top100[[i]]$productive_frequency))
prod_freq_matrix[i,10:12]<-c(range(myfilelist_top50[[i]]$productive_frequency),mean(myfilelist_top50[[i]]$productive_frequency))
}
# print productive frequency summary table
prod_freq_matrix %>%
DT::datatable()
In the initial analysis, it seemed as if including the d_gene in the inner_join resulted in a lot of lost matches. Double checking…
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
#create inner_join function that include d gene
merge_tcrs_withd<-function(dd1, dd2){
merged_files_d<-inner_join(dd1, dd2, by=c("v_gene" = "v_gene", "j_gene" ="j_gene", "cdr3_amino_acid"= "cdr3_amino_acid","d_gene"="d_gene"))
return(merged_files_d)
}
#make empty lists for plots and correlations
myplots_withd<-list()
mycorrs_withd<-list()
mydims_withd<-list()
#run loop such that i=top 50 clone tsvs and h=full clone list
#e.g. myplots[[1]][[2]]= a comparison of the top50 clones of file 1 to
# all clones in file 2
for (i in 1:length(myfilelist)){
#create plot and correlation list for file i within myplot_list
myplots_withd[[i]]<-list()
mycorrs_withd[[i]]<-list()
mydims_withd[[i]]<-list()
for (h in 1:length(myfilelist)){
if (i==h) next
tcr_i_h_withd<-merge_tcrs_withd(myfilelist_top50[[i]],myfilelist[[h]])
p_withd<-ggplot(tcr_i_h_withd, aes(x=productive_frequency.x, y= productive_frequency.y)) +
geom_point()+
scale_x_log10() +
scale_y_log10() +
geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
theme_classic(base_size = 14) +
ggtitle("correlation of productive frequency of top 50 clones with d genes")
correlation_withd<-cor(tcr_i_h_withd$productive_frequency.x, tcr_i_h_withd$productive_frequency.y)
dimension50_withd<-dim(tcr_i_h_withd)
myplots_withd[[i]][[h]]<-p_withd
mycorrs_withd[[i]][[h]]<-correlation_withd
mydims_withd[[i]][[h]]<-dimension50_withd
}
}
names(myplots_withd)<-list.files(path=indir, full.names=FALSE)
names(mydims_withd)<-list.files(path=indir, full.names=FALSE)
# function to display all correlations and plots.
# will also write to file
disp_plots_corr_withd<-function(file1,file2){
cat(paste("Top 50 clones of ",NameList[file1],"compared to clones of ",NameList[file2],"with d genes","\n"))
cat(paste("Pearson Correlation = ",mycorrs_withd[[file1]][[file2]]))
print(myplots_withd[[file1]][[file2]])
#print plots to file
png_file<-paste(outdir,"top50",NameList[file1],"_",NameList[file2],"withD.png",sep="")
png(file=png_file,width=600, height=350)
print(myplots_withd[[file1]][[file2]])
dev.off()
}
for (i in 1:length(myfilelist)){
for (h in 1:length(myfilelist)){
if (i==h) next
disp_plots_corr_withd(i,h)
}
}
## Top 50 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.999166262046191
## Top 50 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.99888032257963
## Top 50 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998998416207627
## Top 50 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.997190953292622
## Top 50 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.996244044548813
## Top 50 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.999159265145344
## Top 50 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.999164435857467
## Top 50 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.999261819151957
## Top 50 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.99671984709148
## Top 50 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.997875729695303
## Top 50 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.998879745493858
## Top 50 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.999168240179539
## Top 50 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.999406679771663
## Top 50 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.997272809748641
## Top 50 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.997806117133848
## Top 50 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.999015320734871
## Top 50 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.999293924043519
## Top 50 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.999447523996795
## Top 50 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.998375106963167
## Top 50 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998260342740857
## Top 50 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.997215372056811
## Top 50 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.99677527545078
## Top 50 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.997295454351495
## Top 50 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998410838846518
## Top 50 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.996442387751591
## Top 50 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.996315685425047
## Top 50 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.997973590995711
## Top 50 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.997916899185321
## Top 50 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998363528949173
## Top 50 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.996453863578923
#make matrix from mycorrs such that rows = top clones data and cols = all clones datas
mycorrs_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
for (h in 1:length(myfilelist)){
if (i==h){
mycorrs_mx_withd[i,h]<-NA
}else{
mycorrs_mx_withd[i,h]<-mycorrs_withd[[i]][[h]]}
}
}
rownames(mycorrs_mx_withd)<-NameList
colnames(mycorrs_mx_withd)<-NameList
#prepare correlation matrix for heatmap
melted_corr_withd <- melt(mycorrs_mx_withd)
#plot correlation heatmap
ggplot(data = melted_corr_withd, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
ggtitle("Pearson correlations for top 50 clones, d genes included") +
xlab("") +
ylab("")
# Heatmap not very informative, maybe print table?
mycorrs_mx_withd %>%
DT::datatable()
##############################################
### Repeat analysis with top 100 clones #####
##############################################
#make empty lists for plots and correlations
myplots100_withd<-list()
mycorrs100_withd<-list()
mydims100_withd<-list()
#run loop such that i=top 100 clone tsvs and h=full clone list
#e.g. myplots[[1]][[2]]= a comparison of the top100 clones of file 1 to
# all clones in file 2
for (i in 1:length(myfilelist)){
#create plot and correlation list for file i within myplot_list
myplots100_withd[[i]]<-list()
mycorrs100_withd[[i]]<-list()
mydims100_withd[[i]]<-list()
for (h in 1:length(myfilelist)){
if (i==h) next
tcr100_i_h_withd<-merge_tcrs_withd(myfilelist_top100[[i]],myfilelist[[h]])
p100_withd<-ggplot(tcr100_i_h_withd, aes(x=productive_frequency.x, y= productive_frequency.y)) +
geom_point()+
scale_x_log10() +
scale_y_log10() +
geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
theme_classic(base_size = 14) +
ggtitle("correlation of productive frequency of top 100 clones with d genes")
correlation100_withd<-cor(tcr100_i_h_withd$productive_frequency.x, tcr100_i_h_withd$productive_frequency.y)
dimensions100_withd<-dim(tcr100_i_h_withd)
myplots100_withd[[i]][[h]]<-p100_withd
mycorrs100_withd[[i]][[h]]<-correlation100_withd
mydims100_withd[[i]][[h]]<-dimensions100_withd
}
}
names(myplots100_withd)<-list.files(path=indir, full.names=FALSE)
names(mydims100_withd)<-list.files(path=indir, full.names=FALSE)
# function to display all correlations and plots for top100.
# will also write to file
disp_plots_corr100_withd<-function(file1,file2){
cat(paste("Top 100 clones of ",NameList[file1],"compared to clones of ",NameList[file2],"with d genes","\n"))
cat(paste("Pearson Correlation = ",mycorrs100_withd[[file1]][[file2]]))
print(myplots100_withd[[file1]][[file2]])
#print plots to file
png_file<-paste(outdir,"top100",NameList[file1],"_",NameList[file2],"withD.png",sep="")
png(file=png_file,width=600, height=350)
print(myplots100_withd[[file1]][[file2]])
dev.off()
}
for (i in 1:length(myfilelist)){
for (h in 1:length(myfilelist)){
if (i==h) next
disp_plots_corr100_withd(i,h)
}
}
## Top 100 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.999178075235607
## Top 100 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.998916855799233
## Top 100 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998988330114623
## Top 100 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.997163144858478
## Top 100 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.996412284959479
## Top 100 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.999164027698048
## Top 100 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.999176761532961
## Top 100 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.999179611958061
## Top 100 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.996639031611959
## Top 100 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.997941597424477
## Top 100 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.99891571360525
## Top 100 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.999182005603366
## Top 100 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.999372729041565
## Top 100 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.997237233718641
## Top 100 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.99789992542615
## Top 100 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.999003488597187
## Top 100 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.999225057234736
## Top 100 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.999402859140985
## Top 100 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.998417367243602
## Top 100 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998263313735027
## Top 100 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.997182377273628
## Top 100 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.996685550370382
## Top 100 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.997263526038884
## Top 100 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998425379632029
## Top 100 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.996413523293684
## Top 100 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.996474185922898
## Top 100 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.998047600035829
## Top 100 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.997991239488617
## Top 100 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998373582175545
## Top 100 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.996444174142193
#make matrix from mycorrs such that rows = top clones data and cols = all clones datas
mycorrs100_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
for (h in 1:length(myfilelist)){
if (i==h){
mycorrs100_mx_withd[i,h]<-NA
}else{
mycorrs100_mx_withd[i,h]<-mycorrs100_withd[[i]][[h]]}
}
}
rownames(mycorrs100_mx_withd)<-NameList
colnames(mycorrs100_mx_withd)<-NameList
#prepare correlation matrix for heatmap
melted_corr100_withd <- melt(mycorrs100_mx_withd)
#plot correlation heatmap
ggplot(data = melted_corr100_withd, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
ggtitle("Pearson correlations for top 100 clones with d genes") +
xlab("") +
ylab("")
# Heatmap not very informative, maybe print table?
mycorrs100_mx_withd %>%
DT::datatable()
##############################################
### Repeat analysis with top 500 clones #####
##############################################
#make empty lists for plots and correlations
myplots500_withd<-list()
mycorrs500_withd<-list()
mydims500_withd<-list()
#run loop such that i=top 500 clone tsvs and h=full clone list
#e.g. myplots[[1]][[2]]= a comparison of the top500 clones of file 1 to
# all clones in file 2
for (i in 1:length(myfilelist)){
#create plot and correlation list for file i within myplot_list
myplots500_withd[[i]]<-list()
mycorrs500_withd[[i]]<-list()
mydims500_withd[[i]]<-list()
for (h in 1:length(myfilelist)){
if (i==h) next
tcr500_i_h_withd<-merge_tcrs_withd(myfilelist_top500[[i]],myfilelist[[h]])
p500_withd<-ggplot(tcr500_i_h_withd, aes(x=productive_frequency.x, y= productive_frequency.y)) +
geom_point()+
scale_x_log10() +
scale_y_log10() +
geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
theme_classic(base_size = 14) +
ggtitle("correlation of productive frequency of top 500 clones with d genes")
correlation500_withd<-cor(tcr500_i_h_withd$productive_frequency.x, tcr500_i_h_withd$productive_frequency.y)
dimensions500_withd<-dim(tcr500_i_h_withd)
myplots500_withd[[i]][[h]]<-p500_withd
mycorrs500_withd[[i]][[h]]<-correlation500_withd
mydims500_withd[[i]][[h]]<-dimensions500_withd
}
}
names(myplots500_withd)<-list.files(path=indir, full.names=FALSE)
names(mydims500_withd)<-list.files(path=indir, full.names=FALSE)
# function to display all correlations and plots for top100.
# will also write to file
disp_plots_corr500_withd<-function(file1,file2){
cat(paste("Top 500 clones of ",NameList[file1],"compared to clones of ",NameList[file2],"with d genes","\n"))
cat(paste("Pearson Correlation = ",mycorrs500_withd[[file1]][[file2]]))
print(myplots500_withd[[file1]][[file2]])
#print plots to file
png_file<-paste(outdir,"top500",NameList[file1],"_",NameList[file2],"withD.png",sep="")
png(file=png_file,width=600, height=350)
print(myplots500_withd[[file1]][[file2]])
dev.off()
}
for (i in 1:length(myfilelist)){
for (h in 1:length(myfilelist)){
if (i==h) next
disp_plots_corr500_withd(i,h)
}
}
## Top 500 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.999186870841341
## Top 500 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.99894784452698
## Top 500 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998980181330736
## Top 500 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.997173541144413
## Top 500 clones of DFCI_20210908_DNAreference.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.996551409195004
## Top 500 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.99917485861993
## Top 500 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.999181521111352
## Top 500 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.999120478472191
## Top 500 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.996615386026278
## Top 500 clones of E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998003971636869
## Top 500 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.998941719464135
## Top 500 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.999185288329041
## Top 500 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.999330959505323
## Top 500 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.997221595028865
## Top 500 clones of E4412_1B_TIL_CONTROL.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.997960625495892
## Top 500 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.998992763082061
## Top 500 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.999150147626278
## Top 500 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.99935016017218
## Top 500 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.998451306022531
## Top 500 clones of E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998245210457028
## Top 500 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.997184626548416
## Top 500 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.996633877534219
## Top 500 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.997239336763174
## Top 500 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.99845148024227
## Top 500 clones of E4412_2B_CONTROLDNA2.tsv compared to clones of E4412_3_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.996392082408524
## Top 500 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of DFCI_20210908_DNAreference.tsv with d genes
## Pearson Correlation = 0.996614269322668
## Top 500 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1A_TIL_CONTROL_DNA.tsv with d genes
## Pearson Correlation = 0.998103238434014
## Top 500 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_1B_TIL_CONTROL.tsv with d genes
## Pearson Correlation = 0.998060457007807
## Top 500 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_2A_TIL_CONTROL_DNA2.tsv with d genes
## Pearson Correlation = 0.998373605841485
## Top 500 clones of E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of E4412_2B_CONTROLDNA2.tsv with d genes
## Pearson Correlation = 0.996468410215163
#make matrix from mycorrs such that rows = top clones data and cols = all clones datas
mycorrs500_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
for (h in 1:length(myfilelist)){
if (i==h){
mycorrs500_mx_withd[i,h]<-NA
}else{
mycorrs500_mx_withd[i,h]<-mycorrs500_withd[[i]][[h]]}
}
}
rownames(mycorrs500_mx_withd)<-NameList
colnames(mycorrs500_mx_withd)<-NameList
#prepare correlation matrix for heatmap
melted_corr500_withd <- melt(mycorrs500_mx_withd)
#plot correlation heatmap
ggplot(data = melted_corr500_withd, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
ggtitle("Pearson correlations for top 500 clones with d genes") +
xlab("") +
ylab("")
# Heatmap not very informative, maybe print table?
mycorrs500_mx_withd %>%
DT::datatable()
######### print dimensions to evaluate if some clones are being lost with the inner join
#make matrix from mydims such that rows = top clones data and cols = all clones datas
# for top50 with d genes
mydims_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
for (h in 1:length(myfilelist)){
if (i==h){
mydims_mx_withd[i,h]<-NA
}else{
mydims_mx_withd[i,h]<-mydims_withd[[i]][[h]][[1]]}
}
}
rownames(mydims_mx_withd)<-NameList
colnames(mydims_mx_withd)<-NameList
#print matrix
print("Total number of clones after merge. Rows = Top 50 clone file.")
## [1] "Total number of clones after merge. Rows = Top 50 clone file."
mydims_mx_withd %>%
DT::datatable()
# for top100 with d genes
mydims100_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
for (h in 1:length(myfilelist)){
if (i==h){
mydims100_mx_withd[i,h]<-NA
}else{
mydims100_mx_withd[i,h]<-mydims100_withd[[i]][[h]][[1]]}
}
}
rownames(mydims100_mx_withd)<-NameList
colnames(mydims100_mx_withd)<-NameList
#print matrix
print("Total number of clones after merge. Rows = Top 100 clone file.")
## [1] "Total number of clones after merge. Rows = Top 100 clone file."
mydims100_mx_withd %>%
DT::datatable()
# for top500 with d genes
mydims500_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
for (h in 1:length(myfilelist)){
if (i==h){
mydims500_mx_withd[i,h]<-NA
}else{
mydims500_mx_withd[i,h]<-mydims500_withd[[i]][[h]][[1]]}
}
}
rownames(mydims500_mx_withd)<-NameList
colnames(mydims500_mx_withd)<-NameList
#print matrix
print("Total number of clones after merge. Rows = Top 500 clone file.")
## [1] "Total number of clones after merge. Rows = Top 500 clone file."
mydims500_mx_withd %>%
DT::datatable()
Including D genes does not seem significantly affect the results. Marking earlier code as eval=FALSE. Will knit with just with d genes code evaluated.
#calculate mean productive frequencies for all files
top50_clone_freq_mean<-mean(prod_freq_matrix[,"top50_mean"])
top100_clone_freq_mean<-mean(prod_freq_matrix[,"top100_mean"])
top500_clone_freq_mean<-mean(prod_freq_matrix[,"top500_mean"])
all_clone_freq_mean<-mean(prod_freq_matrix[,"all_clones_mean"])
# calculate fraction of clones with freq <0.001
top50_freq_one_tenth<-sum(myfilelist_top50[[1]]$productive_frequency <0.001)/sum(myfilelist_top50[[1]]$productive_frequency>0)
top100_freq_one_tenth<-sum(myfilelist_top100[[1]]$productive_frequency <0.001)/sum(myfilelist_top100[[1]]$productive_frequency>0)
top500_freq_one_tenth<-sum(myfilelist_top500[[1]]$productive_frequency <0.001)/sum(myfilelist_top500[[1]]$productive_frequency>0)
all_freq_one_tenth<-sum(myfilelist[[1]]$productive_frequency <0.001)/sum(myfilelist[[1]]$productive_frequency>0)
# calculate fraction of clones with freq <0.01
top50_freq_one<-sum(myfilelist_top50[[1]]$productive_frequency<0.01)/sum(myfilelist_top50[[1]]$productive_frequency>0)
top100_freq_one<-sum(myfilelist_top100[[1]]$productive_frequency<0.01)/sum(myfilelist_top100[[1]]$productive_frequency>0)
top500_freq_one<-sum(myfilelist_top500[[1]]$productive_frequency<0.01)/sum(myfilelist_top500[[1]]$productive_frequency>0)
all_freq_one<-sum(myfilelist[[1]]$productive_frequency<0.01)/sum(myfilelist[[1]]$productive_frequency>0)
# make table of original file lengths
total_clones<-matrix(nrow=length(myfilelist),ncol=1)
for (i in 1:length(myfilelist)){
total_clones[i,1]<-dim(myfilelist[[i]])[[1]]
}
rownames(total_clones)<- NameList
colnames(total_clones)<-c("Total Clones")
#print matrix
print("Total clones in each file")
## [1] "Total clones in each file"
total_clones %>%
DT::datatable()
Sept 16,2021 # Make a histogram of the productive frequencies of each sample
#create empty data frame
prod_freq_df<-data.frame()
# add productive frequencies to the dataframe
for (i in 1:length(myfilelist)){
#make a dataframe from the ith productive frequency numbers
temp_df<-data.frame(productive_frequency=myfilelist[[i]]$productive_frequency)
#add info about where data came from
temp_df$fileName <- names(myfilelist[i])
# merge dataframes to make one large dataframe
prod_freq_df<- rbind(prod_freq_df,temp_df)
}
ggplot(prod_freq_df, aes(productive_frequency, fill = fileName)) +
geom_histogram(alpha = 0.5, bins=100, position = 'identity') +
scale_x_log10() +
geom_abline(slope = 1, intercept = 0, linetype=2) +
theme_classic(base_size = 14)
ggplot(prod_freq_df, aes(productive_frequency, fill = fileName)) +
geom_histogram(alpha = 0.5, bins=100) +
scale_x_log10() +
geom_abline(slope = 1, intercept = 0, linetype=2) +
theme_classic(base_size = 14)